In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
cf.go_offline()
In [2]:
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
In [2]:
#Load the excel files.
app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
credit_card = pd.read_csv('credit_card_balance.csv')
installments = pd.read_csv('installments_payments.csv')
pos_cash = pd.read_csv('POS_CASH_balance.csv')
previous_app = pd.read_csv('previous_application.csv')
In [3]:
#Let's take a look at the initial columns in the train data
app_train.head()
Out[3]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
1 100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3 100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 122 columns

In [4]:
#Check the shape of the dataset
print('the shape of the train data is:',app_train.shape)
print('the shape of the test data is:',app_test.shape)
print('the shape of the bureau data is:',bureau.shape)
print('the shape of the bureau_balance data is:',bureau_balance.shape)
print('the shape of the credit card data is:',credit_card.shape)
print('the shape of the installments data is:',installments.shape)
print('the shape of the pos cash data is:',pos_cash.shape)
print('the shape of the previous_application data is:',previous_app.shape)
the shape of the train data is: (307511, 122)
the shape of the test data is: (48744, 121)
the shape of the bureau data is: (1716428, 17)
the shape of the bureau_balance data is: (27299925, 3)
the shape of the credit card data is: (3840312, 23)
the shape of the installments data is: (13605401, 8)
the shape of the pos cash data is: (10001358, 8)
the shape of the previous_application data is: (1670214, 37)

The train dataset has about 307,500 rows and 122 columns. Working on a dataset with these many columns is not easy and working on individual columns in redundant.

Let's write functions to analyze the data. The major areas of analysis are missing values, data type conversions and handling continuous/discrete variables.

In [3]:
#Lets check the missing values and percentage of missing values
def missing_values(df):
    miss_values = df.isnull().sum()
    per_miss_values = 100*df.isnull().sum()/len(df)
    #Create a table to show these 2 values
    table = pd.concat([miss_values,per_miss_values],axis=1)
    table = table.rename(columns = {0:'Missing Values', 1:'% of Total values'})
    table = table[table.iloc[:,1]!=0].sort_values('% of Total values', ascending = False)
    return table

missing_train = missing_values(app_train)
missing_test = missing_values(app_test)
missing_bureau = missing_values(bureau)
missing_bureau_balance = missing_values(bureau_balance)
missing_creditcard = missing_values(credit_card)
missing_installments = missing_values(installments)
missing_poscash = missing_values(pos_cash)
missing_previous_app = missing_values(previous_app)
In [6]:
#Lets get on with exploring the data. We will start off with a few attributes from the train data and gradually include attributes
#from other datasets
##Lets check the distribution of the target.
count = app_train['TARGET'].value_counts()
fr = pd.DataFrame({'labels':count.index, 'values':count.values})
fr.iplot(kind='pie', labels = 'labels', values = 'values', title = 'Distribution of the target variable')
In [7]:
#Its a very imbalanced problem. ~92% of the people have repaid their loans on time.
#Separate the people whove repaid loans from the people who have not..
app_train['TARGET'].dtypes
repaid = app_train[app_train['TARGET']==0]
non_repaid = app_train[app_train['TARGET']==1]
In [8]:
#Let's check the trend of the time and date of loan application.
count0 = repaid['WEEKDAY_APPR_PROCESS_START'].value_counts()
count1 = non_repaid['WEEKDAY_APPR_PROCESS_START'].value_counts()

data1 = go.Bar(x=count0.index, y=count0.values, name='Loan repayers')
data2 = go.Bar(x=count1.index, y=count1.values, name='Loan non-repayers')

data = [data1,data2]
layout = go.Layout(barmode = 'group')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='barplot')
#-----------------------------------------------------------------------------

count2 = repaid['HOUR_APPR_PROCESS_START'].value_counts().sort_index()
count3 = non_repaid['HOUR_APPR_PROCESS_START'].value_counts().sort_index()

trace0 = go.Scatter(x=count2.index, y=count2.values, mode='lines+markers', name='Loan repayers')
trace1 = go.Scatter(x=count3.index, y=count3.values, mode='lines+markers', name='Loan non repayers')

trace = [trace0,trace1]
py.iplot(trace, filename='Time distribution')
In [9]:
#There seems to be no significant relationship/insights between the target variable and the time and date of applications filed.
#Let's get on with the analysis of loans.
count = app_train['NAME_CONTRACT_TYPE'].value_counts()/len(app_train)
fig1 = [go.Bar(x=count.index, y=count.values, name='Types of loans distribution')]
layout1 = go.Layout(title = 'Types of loans distribution')
fig_1 = go.Figure(data=fig1, layout=layout1)
py.iplot(fig_1, filename='simple bar')
#-----------------------------------------------------------------------------------------------------------

#Almost 90% of the loans are cash loans. Now within each type of loan what are the percentages of loan repayers and non-repayers.
count = app_train['NAME_CONTRACT_TYPE'].value_counts()
count_0 =[]
count_1 =[]
for i in count.index:
    count_0.append(np.sum(app_train['TARGET'][app_train['NAME_CONTRACT_TYPE']==i]==0))
    count_1.append(np.sum(app_train['TARGET'][app_train['NAME_CONTRACT_TYPE']==i]==1))
    
trace1 = go.Bar(x=count.index, y=count_0/count.sum(), name = '0')
trace2 = go.Bar(x=count.index, y=count_1/count.sum(), name = '1')

data = [trace1,trace2]
layout = go.Layout(title = 'Loan types distribution in terms of repayment')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure, filename='loan plot')
In [10]:
#Let's consider the gender of the applicants. The gender breakdown and the relationship of each gender with loan payment/non payment
gender = app_train['CODE_GENDER'].value_counts()
gd = pd.DataFrame({'labels':gender.index, 'values':gender.values})
gd.iplot(kind='pie', labels='labels', values='values', title = 'distribution of the gender variable')

#--------------------------------------------------------------------------------------------------
male = app_train[app_train['CODE_GENDER']=='M']['TARGET'].value_counts()
female = app_train[app_train['CODE_GENDER']=='F']['TARGET'].value_counts()
fig = {'data':[{'labels':male.index, 'values':male.values, 'type':'pie', 'hoverinfo':'label+percent+name', 'text':'Male', 'hole':0.6, 'domain':{'x':[0,0.48]}},
               {'labels':female.index, 'values':female.values, 'type':'pie', 'hoverinfo':'label+percent+name', 'text':'Female', 'hole':0.6, 'domain':{'x':[0.52,1]}}
              ],
      'layout': {'title': 'Gender Breakdown in terms of loans applied',
               'showlegend': True, 
               "annotations": [
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Male",
                "x": 0.19,
                "y": 0.50
            },
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Female",
                "x": 0.81,
                "y": 0.50
            }
                ]
      }
      }
py.iplot(fig, filename='gender donut')

#-----------------------------------------------------------------------------------
#Gender distribution in terms of loans(cash vs revolving loans)
temp1 = app_train[app_train['NAME_CONTRACT_TYPE']=='Cash loans']['CODE_GENDER'].value_counts()
temp2 = app_train[app_train['NAME_CONTRACT_TYPE']=='Revolving loans']['CODE_GENDER'].value_counts()

fig = {'data':[{'labels':temp1.index, 'values':temp1.values, 'type':'pie', 'hole':0.6, 'domain':{'x':[0,0.48]}, 'hoverinfo':'label+percent+name'},
               {'labels':temp2.index, 'values':temp2.values, 'type':'pie', 'hole':0.6, 'domain':{'x':[0.52,1]}, 'hoverinfo':'label+percent+name'}
              ], 
       'layout': {'title': 'Gender Distribution in terms of types of loan applied',
               'showlegend': True, 
               "annotations": [
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Cash Loans",
                "x": 0.16,
                "y": 0.50
            },
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Revolving Loans",
                "x": 0.86,
                "y": 0.50
            }
                                ]
        
                }
      }
py.iplot(fig, filename='loan gender plot')
In [65]:
#Analyze the client attributes. Whether the client has a vehicle, house or/children
vehicle = app_train['FLAG_OWN_CAR'].value_counts()
house = app_train['FLAG_OWN_REALTY'].value_counts()
child = app_train['CNT_CHILDREN'].value_counts()

fig = {
    'data': [
        {
            'labels': vehicle.index,
            'values': vehicle.values,
            'type': 'pie',
            'name': 'Vehicle',
            'domain': {'x': [0, .48],
                       'y': [.52, 1]},
            'hoverinfo':'label+percent+name',
            'text':'Vehicle Owned',
            'textposition':'inside',
            'hole':0.5
        },
        {
            'labels': house.index,
            'values': house.values,
            'type': 'pie',
            'name': 'House',
            'domain': {'x': [.52, 1],
                       'y': [.52, 1]},
            'hoverinfo':'label+percent+name',
            'text':'Vehicle Owned',
            'textposition':'inside',
            'hole':0.5

        },
        {
            'labels': child.index,
            'values': child.values,
            'type': 'pie',
            'name': 'Children',
            'domain': {'x': [.33, .67],
                       'y': [0, .48]},
            'hoverinfo':'label+percent+name',
            'text':'Vehicle Owned',
            'textposition':'inside',
            'hole':0.5
        }
    ],
    'layout': {'title': 'Breakdown of Loan Applicants',
               'showlegend': False, 
               "annotations": [
            {
                "font": {
                    "size": 10
                },
                "showarrow": False,
                "text": "VehicleOwned",
                "x": 0.19,
                "y": 0.77
            },
            {
                "font": {
                    "size": 10
                },
                "showarrow": False,
                "text": "HouseOwners",
                "x": 0.81,
                "y": 0.78
            },
            {
                "font": {
                    "size": 10
                },
                "showarrow": False,
                "text": "Children",
                "x": 0.50,
                "y": 0.23
            }
        ]
              }
}
py.iplot(fig, filename='donut')
In [93]:
#Do ownership of vehicles and houses affect the repayment of the loan?Let's check it out
vehicle = app_train['FLAG_OWN_CAR'].value_counts()
count_0 = []
count_1 = []

for i in vehicle.index:
    count_0.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_CAR']==i]==0))
    count_1.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_CAR']==i]==1))
    
trace1 = go.Bar(x=vehicle.index, y=count_0/vehicle.sum(), name='0')
trace2 = go.Bar(x=vehicle.index, y=count_1/vehicle.sum(), name='1')

data = [trace1,trace2]
layout = go.Layout(title = 'Vehicle Owner distribution in terms of repayment')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure, filename='vehicle plot')

#-------------------------------------------------------------------------------
house = app_train['FLAG_OWN_REALTY'].value_counts()
count_0 = []
count_1 = []

for i in house.index:
    count_0.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_REALTY']==i]==0))
    count_1.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_REALTY']==i]==1))
    
trace1 = go.Bar(x=house.index, y=count_0/house.sum(), name='0')
trace2 = go.Bar(x=house.index, y=count_1/house.sum(), name='1')

trace=[trace1,trace2]
layout = go.Layout(title = 'Loan repayment probability among house owners')

fig = go.Figure(data=trace, layout=layout)
py.iplot(fig, filename='House loans')
In [123]:
#How does number of children affect the loan repayment? 
ch = app_train['CNT_CHILDREN'].value_counts().sort_index()
df = pd.DataFrame({'Children':ch.index, 'Count':ch.values})
df.iplot(kind='pie', labels='Children', values='Count', title = 'Children distribution among loan applicants')
#child = pd.DataFrame(app_train['CNT_CHILDREN'].value_counts().sort_index())
#child1 = child.reset_index()
#child1.columns = ['Children','Count']
#child1

count0 = []
count1 = []

for i in ch.index:
    count0.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==0))
    count1.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==1))
    
trace1 = go.Bar(x=ch.index, y=count0/ch.sum()*100, name='0')
trace2 = go.Bar(x=ch.index, y=count1/ch.sum()*100, name='1')

trace = [trace1, trace2]
layout = go.Layout(title = 'Number of children vs loan repayment')

fig = go.Figure(data=trace, layout=layout)
py.iplot(fig, filename='Child plots')
In [149]:
count0 =[]
count1 =[]
for i in ch.index:
    count0.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==0))
    count1.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==1))

child_data = pd.DataFrame({'Children':ch.index, 'Count':ch.values, 'Count_0':count0, 'Count_1':count1})
child_data
Out[149]:
Children Count Count_0 Count_1
0 0 215371 198762 16609
1 1 61119 55665 5454
2 2 26749 24416 2333
3 3 3717 3359 358
4 4 429 374 55
5 5 84 77 7
6 6 21 15 6
7 7 7 7 0
8 8 2 2 0
9 9 2 0 2
10 10 2 2 0
11 11 1 0 1
12 12 2 2 0
13 14 3 3 0
14 19 2 2 0
In [162]:
income = [f for f in app_train['AMT_INCOME_TOTAL']]
credit = [i for i in app_train['AMT_CREDIT']]
annuity = [j for j in app_train['AMT_ANNUITY']]
goods = [k for k in app_train['AMT_GOODS_PRICE']]

trace1 = go.Histogram(x=credit, opacity = 0.5)

trace_1 = [trace1]

lay_1 = go.Layout(title = 'Distribution of Credit of loan applicants', xaxis=dict(title='Credit'),yaxis=dict(title='No.of Applicants'))
figu_1 = go.Figure(data=trace_1, layout = lay_1)

py.iplot(figu_1, filename = 'continuous plots')
#---------------------------------------------------------------------------

trace2 = go.Histogram(x=annuity, opacity = 0.5)
trace_2 = [trace2]
lay_2 = go.Layout(title = 'Distribution of Annuity of loan applicants', xaxis=dict(title='Annuity'),yaxis=dict(title='No.of Applicants'))
figu_2 = go.Figure(data=trace_2, layout = lay_2)
py.iplot(figu_2, filename = 'continuous plots')

#---------------------------------------------------------------------------
trace3 = go.Histogram(x=goods, opacity = 0.5, xbins=dict(size = 1))
trace_3 = [trace3]
lay_3 = go.Layout(title = 'Distribution of Goods Amount of loan applicants', xaxis=dict(title='Goods'),yaxis=dict(title='No.of Applicants'))
figu_3 = go.Figure(data=trace_3, layout = lay_3)
py.iplot(figu_3, filename = 'continuous plots')
In [155]:
#Another way to plot these continuous vaariables would be to create bins of suitable sizes. 
#This can be done by finding the 25th, 50th and 75th percentile of the variables.
quar = app_train['AMT_CREDIT'].quantile([0.25, 0.5, 0.75, 1])
quar
Out[155]:
0.25     270000.0
0.50     513531.0
0.75     808650.0
1.00    4050000.0
Name: AMT_CREDIT, dtype: float64
In [168]:
#Who was accompanying the client?
client = app_train['NAME_TYPE_SUITE'].value_counts()
cl = pd.DataFrame({'labels':client.index, 'values':client.values})
cl.iplot(kind = 'pie', labels = 'labels', values = 'values', title = 'Distribution of people accompanying the loan applicant')
#------------------------------------------------------------------------------

#Do the people accompanying the client have an influence on loan application.
count0 = []
count1 = []
for i in client.index:
    count0.append(np.sum(app_train['TARGET'][app_train['NAME_TYPE_SUITE']==i]==0))
    count1.append(np.sum(app_train['TARGET'][app_train['NAME_TYPE_SUITE']==i]==1))
    
tr1 = go.Bar(x=client.index, y=count0/client.values, name='0')
tr2 = go.Bar(x=client.index, y=count1/client.values, name='1')

tr = [tr1,tr2]
la = go.Layout(title = 'Effect of Client personnel on Loan applications', xaxis = dict(title = 'Personnel'), yaxis = dict(title = '% of Applications'))

fi = go.Figure(data=tr, layout=la)
py.iplot(fi, filename='personnel plots')
In [187]:
#The income type of the applicant
inc = app_train['NAME_INCOME_TYPE'].value_counts()
it = pd.DataFrame({'labels':inc.index, 'values':inc.values})
it.iplot(kind='pie', labels = 'labels', values = 'values', title = 'Income type of the applicant')
#--------------------------------------------------------------------------------------------------
count0 = app_train[app_train['TARGET']==0]['NAME_INCOME_TYPE'].value_counts()
count1 = app_train[app_train['TARGET']==1]['NAME_INCOME_TYPE'].value_counts()

fig = {'data':[{'type':'pie', 'labels':count0.index, 'values':count0.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0,0.48]}},
               {'type':'pie', 'labels':count1.index, 'values':count1.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0.52,1]}}
              ],
       'layout':{'title': 'Breakdown of Income type of loan applicants',
               "showlegend": True, 
               "annotations": [
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Loans Repaid",
                "x": 0.15,
                "y": 0.50
            },
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Loans Unpaid",
                "x": 0.85,
                "y": 0.50
            }
           
       ]
}}
py.iplot(fig, filename='pieliot')
In [190]:
#Highest education of the client
edu = app_train['NAME_EDUCATION_TYPE'].value_counts()
c0 = []
c1 = []
for i in edu.index:
    c0.append(np.sum(app_train['TARGET'][app_train['NAME_EDUCATION_TYPE']==i]==0))
    c1.append(np.sum(app_train['TARGET'][app_train['NAME_EDUCATION_TYPE']==i]==1))
    
tr1 = go.Bar(x=edu.index, y=c0/edu.values, name='0')
tr2 = go.Bar(x=edu.index, y=c1/edu.values, name='1')

tr = [tr1,tr2]
layo = go.Layout(title = 'Education distribution with repect to paid vs unpaid loans', barmode='stack')

fi = go.Figure(data=tr, layout=layo)
py.iplot(fi, filename = 'lkih')
In [192]:
#family status of the client
fam = app_train['NAME_FAMILY_STATUS'].value_counts()
c0 = []
c1 = []
for i in fam.index:
    c0.append(np.sum(app_train['TARGET'][app_train['NAME_FAMILY_STATUS']==i]==0))
    c1.append(np.sum(app_train['TARGET'][app_train['NAME_FAMILY_STATUS']==i]==1))
    
tr1 = go.Bar(x=fam.index, y=c0/fam.values, name='0')
tr2 = go.Bar(x=fam.index, y=c1/fam.values, name='1')

tr = [tr1,tr2]
layo = go.Layout(title = 'Family Status with respect to paid vs unpaid loans', barmode = 'stack')

fi = go.Figure(data=tr, layout=layo)
py.iplot(fi, filename = 'lkih')
In [193]:
#HOUSe status of the client
house = app_train['NAME_HOUSING_TYPE'].value_counts()
c0 = []
c1 = []
for i in house.index:
    c0.append(np.sum(app_train['TARGET'][app_train['NAME_HOUSING_TYPE']==i]==0))
    c1.append(np.sum(app_train['TARGET'][app_train['NAME_HOUSING_TYPE']==i]==1))
    
tr1 = go.Bar(x=house.index, y=c0/house.values, name='0')
tr2 = go.Bar(x=house.index, y=c1/house.values, name='1')

tr = [tr1,tr2]
layo = go.Layout(title = 'House Status with respect to paid vs unpaid loans', barmode = 'stack')

fi = go.Figure(data=tr, layout=layo)
py.iplot(fi, filename = 'lkih')
In [225]:
#Organisation type
org = app_train['ORGANIZATION_TYPE'].value_counts()
og = pd.DataFrame({'labels':org.index, 'values':org.values})
tr = [go.Bar(x=org.index, y=org.values)]
la = go.Layout(title = 'Distribution of Organisation type among loan applicants', xaxis = dict(tickangle = -45, tickfont = dict(size=10)))
fig = go.Figure(data = tr, layout = la)
py.iplot(fig, filename='jghgh')

#-------------------------------------------------------------
#the top and bottom in terms of loan repayment
repay = app_train[app_train['TARGET']==0]['ORGANIZATION_TYPE'].value_counts()
top = repay[:10]
non_repay = app_train[app_train['TARGET']==1]['ORGANIZATION_TYPE'].value_counts()
topn = non_repay[:10]

fig = {'data':[{'type':'pie', 'labels':top.index, 'values':top.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0,0.48]}},
               {'type':'pie', 'labels':topn.index, 'values':topn.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0.52,1]}}
              ],
       'layout':{'title': 'Top loan payers and defaulters',
               "showlegend": True, 
               "annotations": [
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Loan Payers",
                "x": 0.15,
                "y": 0.50
            },
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Loan Defaulters",
                "x": 0.87,
                "y": 0.50
            }
           
       ]
}}
py.iplot(fig, filename='pieliot')
In [74]:
#Let's plot the correlation between the attributes and the target
cor = app_train.corr()['TARGET'].sort_values()
print('The top positive correlations are \n{}'.format(cor.tail(25)))
print('\nthe top negative correlations are \n{}'.format(cor.head(25)))
The top positive correlations are 
FLAG_DOCUMENT_21               0.003709
FLAG_DOCUMENT_2                0.005417
REG_REGION_NOT_LIVE_REGION     0.005576
REG_REGION_NOT_WORK_REGION     0.006942
OBS_60_CNT_SOCIAL_CIRCLE       0.009022
OBS_30_CNT_SOCIAL_CIRCLE       0.009131
CNT_FAM_MEMBERS                0.009308
CNT_CHILDREN                   0.019187
AMT_REQ_CREDIT_BUREAU_YEAR     0.019930
FLAG_WORK_PHONE                0.028524
DEF_60_CNT_SOCIAL_CIRCLE       0.031276
DEF_30_CNT_SOCIAL_CIRCLE       0.032248
LIVE_CITY_NOT_WORK_CITY        0.032518
OWN_CAR_AGE                    0.037612
DAYS_REGISTRATION              0.041975
FLAG_DOCUMENT_3                0.044346
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_EMP_PHONE                 0.045982
REG_CITY_NOT_WORK_CITY         0.050994
DAYS_ID_PUBLISH                0.051457
DAYS_LAST_PHONE_CHANGE         0.055218
REGION_RATING_CLIENT           0.058899
REGION_RATING_CLIENT_W_CITY    0.060893
DAYS_BIRTH                     0.078239
TARGET                         1.000000
Name: TARGET, dtype: float64

the top negative correlations are 
EXT_SOURCE_3                 -0.178919
EXT_SOURCE_2                 -0.160472
EXT_SOURCE_1                 -0.155317
DAYS_EMPLOYED                -0.044932
FLOORSMAX_AVG                -0.044003
FLOORSMAX_MEDI               -0.043768
FLOORSMAX_MODE               -0.043226
AMT_GOODS_PRICE              -0.039645
REGION_POPULATION_RELATIVE   -0.037227
ELEVATORS_AVG                -0.034199
ELEVATORS_MEDI               -0.033863
FLOORSMIN_AVG                -0.033614
FLOORSMIN_MEDI               -0.033394
LIVINGAREA_AVG               -0.032997
LIVINGAREA_MEDI              -0.032739
FLOORSMIN_MODE               -0.032698
TOTALAREA_MODE               -0.032596
ELEVATORS_MODE               -0.032131
LIVINGAREA_MODE              -0.030685
AMT_CREDIT                   -0.030369
APARTMENTS_AVG               -0.029498
APARTMENTS_MEDI              -0.029184
FLAG_DOCUMENT_6              -0.028602
APARTMENTS_MODE              -0.027284
LIVINGAPARTMENTS_AVG         -0.025031
Name: TARGET, dtype: float64
In [17]:
#Let us consider top 3 positively correlated values
#Days_birth is in negative values. Lets consider the positive values and check the distribution of these ages
print('The ages range between {} years and {} years'.format(round(-(app_train['DAYS_BIRTH'].max())/365), round(-(app_train['DAYS_BIRTH'].min())/365)))
#--------------------------------------------------------------------------------------------------------
age_years = round(-(app_train['DAYS_BIRTH']/365))
age = [a for a in age_years]
d = [go.Histogram(x=age)]
l = go.Layout(title = 'Age Distribution of Loan Applicants', xaxis = dict(title ='Age'), yaxis = dict(title = 'Frequency'))
f = go.Figure(data=d, layout = l)
py.iplot(f, filename='HGTTHHHH')
#-------------------------------------------------------------------------------------------------------
#Lets divide the age into buckets and see the age categories with respect to loans paid and unpaid.
app_train['DAYS_BIRTH'] = age_years
app_train['DAYS_BIRTH']
app = app_train.groupby(['DAYS_BIRTH','TARGET'])['TARGET'].count().unstack().reset_index()
app = app.rename(columns = {'DAYS_BIRTH':'AGE_IN_YEARS', 0:'target0', 1:'target1'})

t0 = go.Bar(x=app['AGE_IN_YEARS'], y=app['target0'])
l0 = go.Scatter(x=app['AGE_IN_YEARS'], y=app['target0'])
t_0 = [t0,l0]
la0 = go.Layout(title = 'Loans paid', xaxis = dict(title = 'Age'), yaxis = dict(title = 'Frequency'))
f0 = go.Figure(data = t_0, layout = la0)
py.iplot(f0, filename = 'ghjek')

t1 = go.Bar(x=app['AGE_IN_YEARS'], y=app['target1'])
l1 = go.Scatter(x=app['AGE_IN_YEARS'], y=app['target1'])
t_1 = [t1,l1]
la1 = go.Layout(title = 'Loans Unpaid', xaxis = dict(title = 'Age'), yaxis = dict(title = 'Frequency'))
f1 = go.Figure(data = t_1, layout = la1)
py.iplot(f1, filename = 'ghjek')
The ages range between 21.0 years and 69.0 years
In [18]:
#Region rating client w city
app_train['REGION_RATING_CLIENT_W_CITY'].value_counts()
#reg = app_train.groupby(['REGION_RATING_CLIENT_W_CITY','TARGET'])['TARGET'].count().unstack().reset_index()
Out[18]:
2    229484
3     43860
1     34167
Name: REGION_RATING_CLIENT_W_CITY, dtype: int64
In [48]:
#The top 3 negative correlations are ext_source1, 2 and 3. Let's plot them against the target variable.
ext1 = app_train[['EXT_SOURCE_1', 'TARGET']].dropna()
ext2 = app_train[['EXT_SOURCE_2', 'TARGET']].dropna()
ext3 = app_train[['EXT_SOURCE_3', 'TARGET']].dropna()
In [50]:
import seaborn as sns
plt.figure(figsize=(12,8))
sns.kdeplot(ext1['EXT_SOURCE_1'])
sns.kdeplot(ext2['EXT_SOURCE_2'])
sns.kdeplot(ext3['EXT_SOURCE_3'])
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x2bc631b2be0>
In [58]:
plt.figure(figsize=(12,8))
sns.kdeplot(ext1[ext1['TARGET']==0]['EXT_SOURCE_1'])
sns.kdeplot(ext1[ext1['TARGET']==1]['EXT_SOURCE_1'])
sns.kdeplot(ext2[ext2['TARGET']==0]['EXT_SOURCE_2'])
sns.kdeplot(ext2[ext2['TARGET']==1]['EXT_SOURCE_2'])
sns.kdeplot(ext3[ext3['TARGET']==0]['EXT_SOURCE_3'])
sns.kdeplot(ext3[ext3['TARGET']==1]['EXT_SOURCE_3'])
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x2bbf6db6320>
In [72]:
#Done with the application_train data. Let's move on to the bureau and bureau_balance data
#We can merge the bureau and bureau_balance tables
In [3]:
print('the shape of the bureau data set is {}'.format(bureau.shape))
print('the number of unique ids in bureau data set are {}'.format(bureau['SK_ID_BUREAU'].nunique()))
print('the shape of the bureau_balance data set is {}'.format(bureau_balance.shape))
print('the number of unique ids in bureau data set are {}'.format(bureau_balance['SK_ID_BUREAU'].nunique()))
the shape of the bureau data set is (1716428, 17)
the number of unique ids in bureau data set are 1716428
the shape of the bureau_balance data set is (27299925, 3)
the number of unique ids in bureau data set are 817395
In [4]:
bureau_new_inner = bureau.merge(bureau_balance, how = 'inner', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU').dropna()
In [5]:
bureau_new_inner_drop = bureau.merge(bureau_balance, how = 'inner', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
In [6]:
bureau_new_left = bureau.merge(bureau_balance, how = 'left', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU').dropna()
In [15]:
bureau_new_left_drop = bureau.merge(bureau_balance, how = 'left', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
In [8]:
print('the shape of left join without dropna is {}'.format(bureau_new_left_drop.shape))
print('the shape of left join with drop is {}'.format(bureau_new_left.shape))
print('the shape of inner join without drop is {}'.format(bureau_new_inner_drop.shape))
print('the shape of inner join with drop is {}'.format(bureau_new_inner.shape))
the shape of left join without dropna is (25121815, 19)
the shape of left join with drop is (980326, 19)
the shape of inner join without drop is (24179741, 19)
the shape of inner join with drop is (980326, 19)
In [26]:
bureau_new_left_drop.SK_ID_CURR.nunique()
Out[26]:
305811
In [101]:
bureau_new_left_drop.isnull().sum()
Out[101]:
SK_ID_CURR                       0
SK_ID_BUREAU                     0
CREDIT_ACTIVE                    0
CREDIT_CURRENCY                  0
DAYS_CREDIT                      0
CREDIT_DAY_OVERDUE               0
DAYS_CREDIT_ENDDATE        1232569
DAYS_ENDDATE_FACT          5978831
AMT_CREDIT_MAX_OVERDUE    18130741
CNT_CREDIT_PROLONG               0
AMT_CREDIT_SUM                  13
AMT_CREDIT_SUM_DEBT        4234392
AMT_CREDIT_SUM_LIMIT      10671361
AMT_CREDIT_SUM_OVERDUE           0
CREDIT_TYPE                      0
DAYS_CREDIT_UPDATE               0
AMT_ANNUITY               10495530
MONTHS_BALANCE              942074
STATUS                      942074
dtype: int64
In [9]:
print(bureau_new_left.SK_ID_CURR.nunique())
print(app_train.SK_ID_CURR.nunique())
print(credit_card.SK_ID_CURR.nunique())
print(pos_cash.SK_ID_CURR.nunique())
print(installments.SK_ID_CURR.nunique())
print(previous_app.SK_ID_CURR.nunique())
24987
307511
103558
337252
339587
338857
In [31]:
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(bureau_new_left.SK_ID_CURR.unique())], set_labels = ('Application train', 'Combined Bureau') )
plt.title('Number of common ids between Application train and bureau combined data')
plt.show()
In [32]:
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(credit_card.SK_ID_CURR.unique())], set_labels = ('Application train', 'Credit Card') )
plt.title('Number of common ids between Application train and Credit Card data')
plt.show()
In [33]:
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(pos_cash.SK_ID_CURR.unique())], set_labels = ('Application train', 'POS Cash') )
plt.title('Number of common ids between Application train and POS Cash data')
plt.show()
In [34]:
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(installments.SK_ID_CURR.unique())], set_labels = ('Application train', 'Installments') )
plt.title('Number of common ids between Application train and installments data')
plt.show()
In [35]:
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(previous_app.SK_ID_CURR.unique())], set_labels = ('Application train', 'Previous App') )
plt.title('Number of common ids between Application train and Previous Application data')
plt.show()
In [19]:
#Let us analyze the bureau and the bureau balance data
print(bureau.shape)
print(bureau_balance.shape)

print(bureau.SK_ID_BUREAU.nunique())
print(bureau_balance.SK_ID_BUREAU.nunique())
(1716428, 17)
(27299925, 3)
1716428
817395
In [7]:
#Let's aggregate the bureau_balance column
bb_orig = bureau_balance.columns.tolist()
no_months = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].size().reset_index()
first = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].min().tolist()
last = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max().tolist()
counts = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts().unstack().reset_index()

no_months['first_month'] = first
no_months['last_month'] = last

no_months1 = no_months.merge(counts, on='SK_ID_BUREAU', how='left')

bureau_balance = no_months1
bb_new = [c for c in bureau_balance.columns if c not in bb_orig]
In [8]:
bureau_balance.head()
Out[8]:
SK_ID_BUREAU MONTHS_BALANCE first_month last_month 0 1 2 3 4 5 C X
0 5001709 97 -96 0 NaN NaN NaN NaN NaN NaN 86.0 11.0
1 5001710 83 -82 0 5.0 NaN NaN NaN NaN NaN 48.0 30.0
2 5001711 4 -3 0 3.0 NaN NaN NaN NaN NaN NaN 1.0
3 5001712 19 -18 0 10.0 NaN NaN NaN NaN NaN 9.0 NaN
4 5001713 22 -21 0 NaN NaN NaN NaN NaN NaN NaN 22.0
In [10]:
#Let's aggregate the bureau column
#Drop the currency attribute as it is 99.9% the same value1
bureau.drop(columns=['CREDIT_CURRENCY'], axis=1, inplace=True)
#Separate the categorical colmuns and dummify them.
orig_col = bureau.columns.tolist()
num_cols = [col for col in bureau.columns if bureau[col].dtype!='object']
cat_cols = [col for col in bureau.columns if bureau[col].dtype=='object']
bureau = pd.get_dummies(bureau, columns=cat_cols)
bureau.head()
Out[10]:
SK_ID_CURR SK_ID_BUREAU DAYS_CREDIT CREDIT_DAY_OVERDUE DAYS_CREDIT_ENDDATE DAYS_ENDDATE_FACT AMT_CREDIT_MAX_OVERDUE CNT_CREDIT_PROLONG AMT_CREDIT_SUM AMT_CREDIT_SUM_DEBT ... CREDIT_TYPE_Interbank credit CREDIT_TYPE_Loan for business development CREDIT_TYPE_Loan for purchase of shares (margin lending) CREDIT_TYPE_Loan for the purchase of equipment CREDIT_TYPE_Loan for working capital replenishment CREDIT_TYPE_Microloan CREDIT_TYPE_Mobile operator loan CREDIT_TYPE_Mortgage CREDIT_TYPE_Real estate loan CREDIT_TYPE_Unknown type of loan
0 215354 5714462 -497 0 -153.0 -153.0 NaN 0 91323.0 0.0 ... 0 0 0 0 0 0 0 0 0 0
1 215354 5714463 -208 0 1075.0 NaN NaN 0 225000.0 171342.0 ... 0 0 0 0 0 0 0 0 0 0
2 215354 5714464 -203 0 528.0 NaN NaN 0 464323.5 NaN ... 0 0 0 0 0 0 0 0 0 0
3 215354 5714465 -203 0 NaN NaN NaN 0 90000.0 NaN ... 0 0 0 0 0 0 0 0 0 0
4 215354 5714466 -629 0 1197.0 NaN 77674.5 0 2700000.0 NaN ... 0 0 0 0 0 0 0 0 0 0

5 rows × 33 columns

In [11]:
#Let's join bureau and bureau_balnce datasets
bur_agg = bureau.merge(bureau_balance, on='SK_ID_BUREAU', how='left')
In [18]:
bur_agg.MONTHS_BALANCE.isnull().sum()
Out[18]:
942074
In [19]:
num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum']
    }
new_cols = [c for c in bur_agg.columns if c not in orig_col]
cat_aggregations = {}
for col in new_cols: cat_aggregations[col] = ['mean']
    
bureau = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'MONTHS_BALANCE'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-19-7629e6667204> in <module>()
     16 for col in new_cols: cat_aggregations[col] = ['mean']
     17 
---> 18 bureau = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})

~\Anaconda3\lib\site-packages\pandas\core\groupby.py in aggregate(self, arg, *args, **kwargs)
   4289         versionadded=''))
   4290     def aggregate(self, arg, *args, **kwargs):
-> 4291         return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
   4292 
   4293     agg = aggregate

~\Anaconda3\lib\site-packages\pandas\core\groupby.py in aggregate(self, arg, *args, **kwargs)
   3722 
   3723         _level = kwargs.pop('_level', None)
-> 3724         result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
   3725         if how is None:
   3726             return result

~\Anaconda3\lib\site-packages\pandas\core\base.py in _aggregate(self, arg, *args, **kwargs)
    476 
    477                 try:
--> 478                     result = _agg(arg, _agg_1dim)
    479                 except SpecificationError:
    480 

~\Anaconda3\lib\site-packages\pandas\core\base.py in _agg(arg, func)
    427                 result = compat.OrderedDict()
    428                 for fname, agg_how in compat.iteritems(arg):
--> 429                     result[fname] = func(fname, agg_how)
    430                 return result
    431 

~\Anaconda3\lib\site-packages\pandas\core\base.py in _agg_1dim(name, how, subset)
    406                 aggregate a 1-dim with how
    407                 """
--> 408                 colg = self._gotitem(name, ndim=1, subset=subset)
    409                 if colg.ndim != 1:
    410                     raise SpecificationError("nested dictionary is ambiguous "

~\Anaconda3\lib\site-packages\pandas\core\groupby.py in _gotitem(self, key, ndim, subset)
   4316         elif ndim == 1:
   4317             if subset is None:
-> 4318                 subset = self.obj[key]
   4319             return SeriesGroupBy(subset, selection=key,
   4320                                  grouper=self.grouper)

~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

~\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'MONTHS_BALANCE'
In [14]:
new_cols
Out[14]:
['CREDIT_ACTIVE_Active',
 'CREDIT_ACTIVE_Bad debt',
 'CREDIT_ACTIVE_Closed',
 'CREDIT_ACTIVE_Sold',
 'CREDIT_TYPE_Another type of loan',
 'CREDIT_TYPE_Car loan',
 'CREDIT_TYPE_Cash loan (non-earmarked)',
 'CREDIT_TYPE_Consumer credit',
 'CREDIT_TYPE_Credit card',
 'CREDIT_TYPE_Interbank credit',
 'CREDIT_TYPE_Loan for business development',
 'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
 'CREDIT_TYPE_Loan for the purchase of equipment',
 'CREDIT_TYPE_Loan for working capital replenishment',
 'CREDIT_TYPE_Microloan',
 'CREDIT_TYPE_Mobile operator loan',
 'CREDIT_TYPE_Mortgage',
 'CREDIT_TYPE_Real estate loan',
 'CREDIT_TYPE_Unknown type of loan',
 'MONTHS_BALANCE',
 'first_month',
 'last_month',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 'C',
 'X']